R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Problem Statement - 1:

Scrape tweets for the hashtag #datascience and do the following
- Hour wise or minute wise (depending upon the scraped tweets) number of tweets - draw a line chart
- Top ten users with the highest number of tweets
- Create a new column in the data itself, to identify the total number of hashtags in each tweet
- Identify those users who have used #datascience as well as #machinelearning. Plot a bar chart of the top 10 users based on their count

Deleting all variables

## Remove every (non-hidden) object returned by ls() from the current environment.
## NOTE(review): wiping the workspace from inside a script also deletes objects
## the caller may still need; confirm this is wanted before sourcing interactively.
rm(list=ls())

Install and load libraries

library(tm)
library(dplyr)
library(stringi)
library(stringr)
library(wordcloud)
library(ggplot2)
library(plotly)
library(twitteR)
library(ROAuth)
library(httr)
library(base64enc)
library(httk)
library(RCurl)
library(zoo)

Twitter API code

## Search Twitter for a hashtag and return the matching tweets as a data frame.
##
## hashtag: search string handed to searchTwitter(), e.g. "#datascience".
## n:       maximum number of tweets to request; defaults to 5000, the value
##          that used to be hard-coded in the body.
##
## Credentials are read from the environment variables TWITTER_API_KEY,
## TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET, so real
## keys never have to be written into this file. When a variable is unset the
## original placeholder string is used.
##
## Returns the result of twListToDF() applied to the tweets that were found.
serchHashtagTwitter <- function(hashtag, n = 5000) {
  api_key <- Sys.getenv("TWITTER_API_KEY", unset = "XXXX")
  api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "XXXXX")
  token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "XXXXX")
  token_secret <- Sys.getenv("TWITTER_ACCESS_SECRET", unset = "XXXXXX")

  ## NOTE(review): this switches off SSL peer verification for every later
  ## httr request in the session, which is a security risk. It is kept to
  ## preserve the existing behaviour; remove it once certificates validate.
  ## config() is namespace-qualified because plotly also exports a config().
  httr::set_config(httr::config(ssl_verifypeer = 0L))

  setup_twitter_oauth(
    consumer_key = api_key,
    consumer_secret = api_secret,
    access_token = token,
    access_secret = token_secret)

  # data.science.tweet <- searchTwitter("#datascience", 5000)
  data.science.tweet <- searchTwitter(hashtag, n = n)
  ## Convert the list of tweets to a data frame
  twListToDF(data.science.tweet)
}

Code to read the tweets that were downloaded from the Twitter API and saved as a CSV file

## Read a CSV file of saved tweets and parse its 'created' column.
##
## tweetfile.Path: path of the CSV file; it must contain a 'created' column
##                 holding timestamps formatted as "%Y-%m-%d %H:%M:%S".
##
## Returns the data frame (strings kept as character) with 'created' converted
## to POSIXct in the GMT time zone. Timestamps that do not match the format
## become NA. Stops with an explicit message when 'created' is missing.
readCSVFile <- function(tweetfile.Path){
  df <- read.csv(
    tweetfile.Path,
    stringsAsFactors = FALSE)
  #View(df)
  if (!"created" %in% names(df)) {
    stop("Column 'created' not found in ", tweetfile.Path, call. = FALSE)
  }
  ## strptime() yields POSIXlt; convert straight away so the data frame only
  ## ever stores the compact POSIXct representation.
  df$created <- as.POSIXct(
    strptime(df$created, format = "%Y-%m-%d %H:%M:%S", tz = "GMT"))
  df
}

Scraping tweets with the hashtag #datascience.

Using ‘searchTwitter’ to get the tweets for #datascience

## Live search for the hashtag '#datascience' (disabled; the saved CSV is used)
# df.data.science.tweet <- serchHashtagTwitter("#datascience")

## Load the previously saved tweets from the CSV file
datascience_tweet_csv <- "/Users/ritesh/pad-datascience/R/unstructureData/data/datascience_tweet.csv"
df.data.science.tweet <- readCSVFile(datascience_tweet_csv)

#head(df.data.science.tweet)
#View(df.data.science.tweet)
## Number of rows and columns that were read
dim(df.data.science.tweet)
## [1] 5000   17
# write.csv(df.data.science.tweet, file = "/Users/ritesh/pad-datascience/R/unstructureData/data/datascience_tweet.csv")

Part - A

Hour wise or minute wise (depending upon the scraped tweets) number of tweets - draw a line chart

## Convert the text column to character
df.data.science.tweet$text <- as.character(df.data.science.tweet$text)
## Replace every character other than letters, digits, "/", "'", "#", "_" and
## space with a space
df.data.science.tweet$text_transformed <-
  gsub("[^A-Za-z0-9///'/#/_ ]", " ", df.data.science.tweet$text)
## Lower-case the transformed text. tolower() is vectorised, so the column
## stays a plain character vector (lapply() would turn it into a list column).
df.data.science.tweet$text_transformed <-
  tolower(df.data.science.tweet$text_transformed)

## Inspect the columns and confirm that 'created' is already a date-time
names(df.data.science.tweet)
##  [1] "X"                "text"             "favorited"       
##  [4] "favoriteCount"    "replyToSN"        "created"         
##  [7] "truncated"        "replyToSID"       "id"              
## [10] "replyToUID"       "statusSource"     "screenName"      
## [13] "retweetCount"     "isRetweet"        "retweeted"       
## [16] "longitude"        "latitude"         "text_transformed"
class(df.data.science.tweet$created)
## [1] "POSIXct" "POSIXt"
## Create a label column (date, hour and minute) used to bucket tweets per minute
df.data.science.tweet$cnvdate <- format(df.data.science.tweet$created, "%d/%m/%y %H:%M") 

class(df.data.science.tweet$cnvdate)
## [1] "character"
# View(df.data.science.tweet)

## Count the tweets per minute. The "dd/mm/yy HH:MM" labels do not sort
## chronologically as text, so the buckets are ordered by their earliest
## timestamp before the first 40 are taken.
tweets_minutes <- df.data.science.tweet %>%
  group_by(cnvdate) %>%
  summarise(count = n(), first_created = min(created)) %>%
  arrange(first_created)
tweets_minutes <- tweets_minutes[, c("cnvdate", "count")]
tweets_minutes_subset <- head(tweets_minutes, 40)
## Freeze that chronological order on the discrete x axis; a character column
## would be re-sorted alphabetically by ggplot.
tweets_minutes_subset$cnvdate <-
  factor(tweets_minutes_subset$cnvdate, levels = tweets_minutes_subset$cnvdate)

## Labels describe what is actually plotted: one point per minute
minutesPlot <-
  ggplot(data=tweets_minutes_subset, aes(x=cnvdate, y=count, group=1)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Tweet Count per Minute", subtitle="Minute Wise Tweet Count") + 
  xlab("\nMinute (dd/mm/yy HH:MM)\n") +
  ylab("\nTweet Count")

# ## Dynamic line chart using 'plotly'
# ggplotly(minutesPlot)
## Static line chart 
plot(minutesPlot)

Part - B

Top ten users with the highest number of tweets

## Number of tweets (one screen name per row)
length(df.data.science.tweet[["screenName"]])
## [1] 5000
## Number of distinct users
n_distinct(df.data.science.tweet[["screenName"]])
## [1] 1932
## Count the tweets of every user and sort the users from most to least active
tweets_user <- df.data.science.tweet %>%
  group_by(screenName) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
class(tweets_user)
## [1] "tbl_df"     "tbl"        "data.frame"
#View(tweets_user)
## The ten most active users
top_10 <- tweets_user %>% head(10)
top_10
## # A tibble: 10 × 2
##         screenName count
##              <chr> <int>
## 1     alevergara78   369
## 2     chidambara09   126
## 3   fernandocuenca    99
## 4    ArkangelScrap    85
## 5      gp_pulipaka    73
## 6     MeeTMercurio    55
## 7  StartUpRealTime    54
## 8        IoTimelab    49
## 9        iamOlAOYE    42
## 10       DaveRubal    39

Part - C

Create a new column in the data itself, to identify the total number of hashtags in each tweet

## Count, for every string in x, how many of its pieces match a pattern.
##
## x:       character vector of strings to examine.
## pattern: regular expression tested against each piece with grep().
## split:   separator handed to strsplit() to cut each string into pieces.
##
## Returns an integer vector with one count per element of x. NA and empty
## strings give 0, and an empty x gives integer(0) rather than NULL.
strcount <- function(x, pattern, split){
  vapply(
    strsplit(x, split),
    ## length() is never NA, so no NA handling is needed on the count
    function(z) length(grep(pattern, z)),
    integer(1))
}

## Create new column 'hashTagCount' containing the number of hashtags in each
## tweet. vapply() keeps it a plain integer vector (lapply() would store a list
## column, which cannot be summed or summarised directly) and works whether
## 'text_transformed' is a character vector or a list of single strings.
df.data.science.tweet$hashTagCount <- vapply(
  df.data.science.tweet$text_transformed,
  FUN = function(x2) strcount(x2, "#\\S+", " "),
  FUN.VALUE = integer(1),
  USE.NAMES = FALSE)

 #unique(df.data.science.tweet$hashTagCount)

Part - D

Identify those users who have used #datascience as well as #machinelearning. Plot a bar chart of the top 10 users based on their count

## Per-tweet number of words containing '#machinelearning', stored directly as
## a numeric vector (no intermediate list column)
df.data.science.tweet$mlHashTagCount <- vapply(
  df.data.science.tweet$text_transformed,
  FUN = function(x2) strcount(x2, "#machinelearning", " "),
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE)

## Per-tweet number of words containing '#datascience'
df.data.science.tweet$dsHashTagCount <- vapply(
  df.data.science.tweet$text_transformed,
  FUN = function(x2) strcount(x2, "#datascience", " "),
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE)

## Add a column marking (1/0) the tweets that contain both '#datascience' and
## '#machinelearning'
df.data.science.tweet$DsMlHashTagCount <- ifelse(
  df.data.science.tweet$dsHashTagCount > 0 &
    df.data.science.tweet$mlHashTagCount > 0,
  1, 0)

#unique(df.data.science.tweet$DsMlHashTagCount)

## Number of such tweets per user. Users without any tweet carrying both
## hashtags are dropped so they can never appear in the top 10.
tweets_user_ds_ml <- 
  df.data.science.tweet %>% 
  group_by(screenName) %>% 
  summarise(totalCount = sum(DsMlHashTagCount, na.rm = TRUE)) %>%
  dplyr::filter(totalCount > 0) %>%
  arrange(desc(totalCount))

top_10_users <- head(tweets_user_ds_ml, 10)

## Bar plot of top 10 users that have used '#datascience' and '#machinelearning'.
## The x axis shows users, so it is labelled accordingly.
bar_chart_tweet_count <- 
  ggplot(top_10_users, aes(x = reorder(screenName,-totalCount), y = totalCount)) + 
  geom_bar(stat = 'identity') + 
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Users Vs Tweet Count", subtitle="Tweets with both #datascience and #machinelearning") + 
  xlab("\nUser") + 
  ylab("\nTweet Count")
ggplotly(bar_chart_tweet_count)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
plot(bar_chart_tweet_count)

Problem Statement - 2:

Scrape tweets from @narendramodi and do the following
- Create a word cloud using the hashtags used in his tweets. (If you are not able to scrape more tweets, please use the data set which was shared already)
- For each quarter identify top 5 hashtags based on frequency. Represent them using bar charts (Tip: use facets)

Read the tweets and create a dataframe

## Load the saved @narendramodi tweets, keeping strings as character
modi_tweets_csv <- "/Users/ritesh/pad-datascience/R/unstructureData/data/narendramodi_tweets.csv"
modi.tweets <- read.csv(file = modi_tweets_csv, stringsAsFactors = FALSE)

## Make sure the tweet text is character
modi.tweets[["text"]] <- as.character(modi.tweets[["text"]])
## Replace every character other than letters, digits, "/", "'", "#", "_" and
## space with a space
modi.tweets[["text_transformed"]] <- gsub(
  pattern = "[^A-Za-z0-9///'/#/_ ]",
  replacement = " ",
  x = modi.tweets[["text"]])
# View(modi.tweets)

Part - A

Create a word cloud using the hashtags used in his tweets. (If you are not able to scrape more tweets, please use the data set which was shared already)

## Break every tweet into its space-separated words and pool them
words_list <- modi.tweets$text_transformed %>% str_split(" ")
words <- unlist(words_list)
## Frequency table of all words (columns: words, Freq)
freq_words <- data.frame(table(words))
## Keep only the words that contain a hashtag, most frequent first
is_hashtag <- grepl('#\\S+', as.character(freq_words$words))
df.hashtag_words <- arrange(freq_words[is_hashtag, ], desc(Freq))

## Despite its name, this holds every hashtag, not just twenty
top_20 <- df.hashtag_words

## Word cloud of the hashtags, sized by frequency
wordcloud(
  words = top_20$words,
  freq = top_20$Freq,
  scale = c(4, 0.5),
  random.order = TRUE,
  random.color = TRUE,
  rot.per = 0.2,
  vfont = c("sans serif", "plain"),
  colors = palette())
## Warning in wordcloud(top_20$words, top_20$Freq, scale = c(4, 0.5),
## random.order = T, : #Sandesh2Soldiers could not be fit on page. It will not
## be plotted.

Part - B

For each quarter identify top 5 hashtags based on frequency. Represent them using bar charts (Tip: use facets)

# View(modi.tweets)
## Add the quarter of each tweet as a separate column
modi.tweets$qtr <- as.yearqtr(modi.tweets$created_at, format = "%Y-%m-%d")
#unique(modi.tweets$qtr)

## Split each tweet into its space-separated words
modi.tweets$words_list <- str_split(modi.tweets$text_transformed, " ")
# View(modi.tweets)

## Group the data frame by quarter and collect each quarter's per-tweet word
## vectors in a list column
quarterly.GrpWord.List <- modi.tweets %>% 
  group_by(qtr) %>%
  summarise(words = list(words_list))
# View(quarterly.GrpWord.List)

## Flatten each quarter's words and keep only the hashtags in the 'words' column
quarterly.GrpWord.List$words <- lapply(
  quarterly.GrpWord.List$words,
  FUN = function(x2) {
    x2 <- unlist(x2)
    x2[grepl('#\\S+', as.character(x2))]
  })

## Convert the quarter column type from 'yearqtr' to 'character'
class(quarterly.GrpWord.List$qtr)
## [1] "yearqtr"
quarterly.GrpWord.List$qtr <- as.character(quarterly.GrpWord.List$qtr)

## Helper: frequency table (columns x, Freq, qtr) of the n most frequent
## hashtags of one quarter. Returns NULL for a quarter without hashtags so that
## such a quarter is skipped instead of raising an error.
topHashtagsForQuarter <- function(hashtags, quarter, n = 5) {
  if (length(hashtags) == 0) {
    return(NULL)
  }
  counts <- as.data.frame(table(x = hashtags), stringsAsFactors = FALSE)
  counts$qtr <- quarter
  head(arrange(counts, desc(Freq)), n)
}

## Create a list of data frames holding the top five hashtags of each quarter.
## Map() always returns a list, so no matrix has to be unpacked afterwards.
grp.df.list <- Map(
  topHashtagsForQuarter,
  quarterly.GrpWord.List$words,
  quarterly.GrpWord.List$qtr)
grp.df.list <- Filter(Negate(is.null), grp.df.list)

## Combine all data frames using rbind
top5.grp.df <- do.call("rbind", grp.df.list)
# View(top5.grp.df)

## One facet per quarter; the subtitle describes the hashtag counts shown
bar_chart_qtr_tweet_count <- 
  ggplot(top5.grp.df, aes(x = reorder(x,-Freq), y = Freq)) + 
  geom_bar(stat = 'identity') + 
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Quarterly Top 5 Hashtag Count", subtitle="Hashtag Vs Count per Quarter") + 
  xlab("\n Top-5 - Hashtags") + 
  ylab("\n Hashtag Count") +
  facet_wrap(~qtr, ncol = 5, scales = "free")
# ggplotly(bar_chart_qtr_tweet_count)
plot(bar_chart_qtr_tweet_count)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.